# 官方教程：10 minutes to pandas
import numpy as np
import pandas as pd

# Series: build one from a plain Python list. The list contains a NaN and the
# captured output below reports dtype float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
'''
>>>
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
'''


# Series with an explicit index: six consecutive days starting 2019-11-11.
pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20191111', periods=6),
)
'''
2019-11-11    1
2019-11-12    2
2019-11-13    3
2019-11-14    4
2019-11-15    5
2019-11-16    6
Freq: D, dtype: int64
'''

# DataFrame from a NumPy array: first generate a range of dates with pandas,
# then use those dates as the row index of the frame.
dates = pd.date_range(start='20130101', periods=6)
'''
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
'''

# 6x4 block of random numbers, rows labelled by `dates`, columns A-D.
pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
'''
2013-01-01  1.237557  1.232055  1.152159 -0.633595
2013-01-02  1.143512 -1.572256 -0.584669 -1.353370
2013-01-03 -0.096162  1.088554 -0.284958  1.045412
2013-01-04 -0.364149  0.211867  0.291192 -3.402922
2013-01-05  1.052863  0.858672 -0.600463  0.972843
2013-01-06  0.035357 -1.455998  1.536727 -0.515005
'''

# DataFrame from a dict. The scalar values ('A', 'b', 'F') are repeated for
# every row, and each column keeps its own dtype.
# The frame is built once and reused by the examples below instead of being
# re-created for every call.
# NOTE(review): the sample outputs below were captured with the columns listed
# alphabetically (A, C, D, E, F, b); pandas versions that keep dict insertion
# order print A, b, C, D, E, F instead — confirm against the installed version.
df2 = pd.DataFrame({
    'A': 1.,
    'b': pd.Timestamp('20191111'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
df2
'''
     A    C  D      E    F          b
0  1.0  1.0  3   test  foo 2019-11-11
1  1.0  1.0  3  train  foo 2019-11-11
2  1.0  1.0  3   test  foo 2019-11-11
3  1.0  1.0  3  train  foo 2019-11-11
'''

# Every column can have a different data type; `dtypes` lists them.
df2.dtypes
'''
A           float64
C           float32
D             int32
E          category
F            object
b    datetime64[ns]
dtype: object
'''

####  Viewing data

# The first rows and the last 2 rows of the frame.
df2.head()
df2.tail(2)

# Row labels (index) and column labels (columns).
df2.index
df2.columns
# The next string is the captured output of `df2.columns`. It used to sit in
# the file as a bare `>>>Index(...)` line, which is a syntax error.
'''
Index(['A', 'C', 'D', 'E', 'F', 'b'], dtype='object')
'''

# The examples in this section share ONE random frame, so the sample outputs
# all describe the same data (previously a new random frame was built for
# every call, so no two outputs could correspond to each other).
# The values are random: the numbers shown come from a single sample run and
# will differ on every execution.
df_rand = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

# Quick summary statistics for every column.
df_rand.describe()
'''
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.097469  0.196200  0.043113 -0.230150
std    0.862345  0.874607  0.906919  1.081846
min   -0.864567 -0.838253 -0.815975 -1.580268
25%   -0.644984 -0.381592 -0.604186 -0.580767
50%    0.097889  0.015386 -0.264205 -0.519946
75%    0.737050  0.801106  0.612968  0.008024
max    1.196107  1.435585  1.415217  1.656289
'''

# Transpose: rows and columns are swapped across the diagonal, so the dates
# become the column labels and A-D become the row labels.
df_rand.T
'''
   2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
A    0.808872   -0.864567    1.196107   -0.325809    0.521587   -0.751376
B    0.951129   -0.838253   -0.402034    1.435585    0.351039   -0.320266
C   -0.711397   -0.245855   -0.815975    1.415217   -0.282554    0.899243
D   -0.506631   -1.580268   -0.533260   -0.596602    1.656289    0.179575
'''


# Sorting by axis labels and by values.
# axis=1 sorts the column labels; ascending=False puts them in D..A order.
df_rand.sort_index(axis=1, ascending=False)
'''
                   D         C         B         A
2013-01-01 -0.506631 -0.711397  0.951129  0.808872
2013-01-02 -1.580268 -0.245855 -0.838253 -0.864567
2013-01-03 -0.533260 -0.815975 -0.402034  1.196107
2013-01-04 -0.596602  1.415217  1.435585 -0.325809
2013-01-05  1.656289 -0.282554  0.351039  0.521587
2013-01-06  0.179575  0.899243 -0.320266 -0.751376
'''

# Order the rows by the values found in column 'B' (smallest first).
df_rand.sort_values(by='B')
'''
                   A         B         C         D
2013-01-02 -0.864567 -0.838253 -0.245855 -1.580268
2013-01-03  1.196107 -0.402034 -0.815975 -0.533260
2013-01-06 -0.751376 -0.320266  0.899243  0.179575
2013-01-05  0.521587  0.351039 -0.282554  1.656289
2013-01-01  0.808872  0.951129 -0.711397 -0.506631
2013-01-04 -0.325809  1.435585  1.415217 -0.596602
'''

# Selection with []
# Pick the single column 'A'.
pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)['A']
# Attribute access gives the same result:
#   pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')).A
'''
2013-01-01    2.312068
2013-01-02   -0.208624
2013-01-03    1.365805
2013-01-04   -0.512995
2013-01-05   -0.744179
2013-01-06    2.008984
'''

# Row slice by position: [0:3] yields the first three rows (end excluded).
pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)[0:3]
'''
 A         B         C         D
2013-01-01  2.312068  2.217361  0.066585  1.366576
2013-01-02 -0.208624  1.207005 -1.638712  0.391372
2013-01-03  1.365805  0.881929  0.998715  0.331184
'''

# Row slice by index label: both end points are part of the result.
pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)['2013-01-01':'20130102']
'''
 A         B         C         D
2013-01-01  0.869959  0.651975 -0.237321 -0.455534
2013-01-02  0.110693  0.415179  0.399076  1.355039
'''

# Selection by label with .loc: here, the row labelled with the first date.
pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
).loc[dates[0]]
'''
A   -0.478690
B    1.285633
C   -0.440323
D    0.850717
Name: 2013-01-01 00:00:00, dtype: float64
'''

# Every row, restricted to the listed columns.
pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
).loc[:, ['A', 'B']]
'''
A         B
2013-01-01  0.108198  0.792905
2013-01-02  2.843338 -0.072358
2013-01-03 -1.138502 -0.245288
2013-01-04 -0.672375  0.909409
2013-01-05 -0.924174  0.940250
2013-01-06 -0.049732 -0.364417
'''


# Label slices on rows and columns at once; both end points are included.
pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
).loc['2013-01-01':'2013-01-03', 'A':'C']
'''
A         B         C
2013-01-01 -2.055685 -1.118244 -0.258992
2013-01-02  0.808990  0.603688 -0.411567
2013-01-03  1.630196 -0.759704 -1.474778
'''

# Selection by position with .iloc. Note: .iloc takes integer positions only,
# never labels. Position 3 is the fourth row.
pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
).iloc[3]
'''
A   -0.978261
B   -1.072384
C   -0.126126
D   -0.102939
Name: 2013-01-04 00:00:00, dtype: float64
'''

# Positional slices on rows and columns; the end position is excluded.
pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
).iloc[1:3, 1:3]
'''
 B         C
2013-01-02  0.503399 -0.203374
2013-01-03 -0.635419  0.894901
'''

# Boolean indexing: keep only the rows whose value in column A is positive.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df[df['A'] > 0]  # df[df.A > 0] has the same effect
'''
 A         B         C         D
2013-01-01  2.312068  2.217361  0.066585  1.366576
2013-01-03  1.365805  0.881929  0.998715  0.331184
2013-01-06  2.008984  0.514919 -0.107510  0.273337
'''

# Missing data: pandas marks missing values with np.nan. Reindexing is one way
# to end up with (and then handle) missing values.
# Keep the first four dates and add a new column 'E', which has no data yet.
df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
'''
 A         B         C         D   E
2013-01-01 -0.102746  0.089461 -0.124941 -0.284159 NaN
2013-01-02  1.315128 -0.393394 -0.284252 -0.182739 NaN
2013-01-03 -1.023878 -0.453757 -1.438008  0.386673 NaN
2013-01-04 -0.485789  0.786617  0.779347 -0.034394 NaN
'''

df_1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
# Fill column E with 1 for the first two rows only.
df_1.loc[dates[0]:dates[1], 'E'] = 1
'''
 A         B         C         D    E
2013-01-01 -0.102746  0.089461 -0.124941 -0.284159  1.0
2013-01-02  1.315128 -0.393394 -0.284252 -0.182739  1.0
2013-01-03 -1.023878 -0.453757 -1.438008  0.386673  NaN
2013-01-04 -0.485789  0.786617  0.779347 -0.034394  NaN
'''

# Drop every row that contains at least one missing value.
df_1.dropna(how='any')
'''
 A         B         C         D    E
2013-01-01 -0.102746  0.089461 -0.124941 -0.284159  1.0
2013-01-02  1.315128 -0.393394 -0.284252 -0.182739  1.0
'''

# Replace the missing values with a chosen number instead.
df_1.fillna(5)
'''
A         B         C         D    E
2013-01-01 -0.102746  0.089461 -0.124941 -0.284159  1.0
2013-01-02  1.315128 -0.393394 -0.284252 -0.182739  1.0
2013-01-03 -1.023878 -0.453757 -1.438008  0.386673  5.0
2013-01-04 -0.485789  0.786617  0.779347 -0.034394  5.0
'''

# Mean of each column (one value per column label).
df.mean()
'''
A    0.421628
B   -0.151784
C   -0.010698
D    0.066715
dtype: float64
'''

# Mean of each row (one value per date).
df.mean(axis=1)
'''
2013-01-01    0.052559
2013-01-02    0.028761
2013-01-03   -0.518177
2013-01-04    0.726541
2013-01-05    0.001617
2013-01-06    0.197488
Freq: D, dtype: float64
'''


# 